### Updates the sintax to run with R 3.0.0 and Zelig 4.1-3
### Changes sm to mw in the lables of the Figure 1

### Reads and analyzes 2002 and 2006 surveys in the file _replic_20022006.RData
### Estimates predicted vote for lula by income category according to each survey
### Loads estimates based on LAPOP
### Plots (Figures 1a and 1b) comparing estimates of lapop and other surveys
### Computes the relationship between HDI-M in each election in each survey
### Plots this relationship (Figure 2)
### Plots a tomorgraphy plot of the bounds of the ecological inferences from aggregate data 

### The main R data file used contains a list in which each element is a matrix with selected variables from public opinion surveys taken close to the 2002 and 2006 elections. Each survey is identified in the file, and includes the following variables: codeibge (official code of the municipality from which respondent was samples), vote (vote intention), income, partyID, educ (education level), sex, age,  region, surv (name and year of teh survey), hdi.2000 (human development index of municipality)

### The secondary R data file contains predicted probabilities of voting for lula in 2002 and 2006, according to LAPOP. This was computed in the same way as we computed the estimates for the other surveys, but because the data is propriatory, we provide only the actual results. 

### File _replic.R contains code to produce these estimates, provided the Lapop data is available
    
library(car)
library(survey)
library(Zelig)
rm(list=ls(all=TRUE))
#setwd("path to local folder")
load("_replic_20022006.RData")   

#### First, generate Figure 1a and 1b
#### Comparing predicted vote for lula in all six surveys
number.surveys  <- length(the.data)
vote.reg <- vote.evs <- income.props <- list() #objects to store results

for(i in number.surveys:1){  #loop runs regressions and stores results for plotting
	survey.name <- names(the.data)[i] 
	local.data <- the.data[[i]]
	#local.data$income <-car::recode(local.data$income,
	#		"c('02-03 sm','03-05 sm')='02-05 sm'")#use only 4 categories of income 
	controls <- c("sex","age","educ","region")
    main.var <- "income" 
    selector <- as.numeric(which(nrow(local.data)-
    			(apply(is.na(local.data[,controls])==FALSE,2,sum))<
    			.2*nrow(local.data)))  
	local.data$votedummy <- 0; local.data$votedummy[grep("lula",local.data$vote)]<-1
    the.reg <- as.formula(paste("as.factor(votedummy) ~",
                        main.var,"+",paste(controls[selector],collapse="+")))    
    cat("\n\n",survey.name,"\nEstimating the model with",
    		nrow(local.data),"observations,",1+length(selector),
        	"variables and DV with",length(levels(as.factor(local.data$votedummy))),"levels. 				Observations by variable: \n")
	print(as.matrix(apply(is.na(local.data[,c(controls,"votedummy",main.var)])==F,2,sum)))
    local.data <- na.omit(local.data[,c(controls,"votedummy",main.var,"we")])
    cat("After eliminating missing data, set has",nrow(local.data),"observations\n")
    flush.console()
    tm <- proc.time()
    options(warn=-1)
    z.vote <- zelig(the.reg, model = "logit", data = local.data, weights=local.data$we) 
    cat("Time elapsed:",((proc.time()-tm)[3])/60,"mins","\n")
    flush.console()
    vote.reg[[survey.name]]<-z.vote

	#Computing expected values
	evs.vote <- evs.vote.lb <- evs.vote.ub <- NULL
    for(k in 1:length(levels(local.data[,main.var]))){#do simulation and store values for categories of income
        x.out <- setx(z.vote, income=levels(local.data[,main.var])[k], 
        		region="SE", sex="f")
        evs <- summary(sim(z.vote, x = x.out))$stats[[1]]       				
        evs.vote <- rbind(evs.vote,evs[,1])
        evs.vote.lb <- rbind(evs.vote.lb,evs[,4])
        evs.vote.ub <- rbind(evs.vote.ub,evs[,5])
    }
    rownames(evs.vote) <- levels(local.data[,main.var])
    evs <- list(vote=evs.vote,ub=evs.vote.ub,lb=evs.vote.lb)
    vote.evs[[survey.name]]<-evs  

	#storing proportions in each income bracket
	income.props[[survey.name]] <- prop.table(table(local.data$income))
}


##### Build the plots  ################
tmp <- vote.evs
rownames(pe) <- gsub("sm","mw",rownames(pe))  #translate to english
load("predictedvoteLULA_LAPOP2007.Rdta")  #read in analogous results from LAPOP 2007
vote.evs <- c(tmp,vote.evs)
	### Assemble a single dataframe with all results
	all.cat <- sort(unique(do.call(c,sapply(vote.evs,function(x){rownames(x[[1]])}))))
	ats <- c(1,1,2,3,4,5,7,10)
	ats <- c(1,0.5,1.5,2.5,3.5,4,7.5,10)
	to.plot <- data.frame(x=ats)
	empty <- matrix(NA,nrow=nrow(to.plot),
			ncol=length(vote.evs),dimnames=list(c(NULL),names(vote.evs)))
	row.names(to.plot) <- all.cat
	pe <- ub <- lb <- cbind(to.plot,empty)
	for(i in 1:length(vote.evs)){
		pe[rownames(vote.evs[[i]][[1]]),names(vote.evs)[i]] <- vote.evs[[i]][[1]]
    	ub[rownames(vote.evs[[i]][[1]]),names(vote.evs)[i]] <- vote.evs[[i]][[2]]
    	lb[rownames(vote.evs[[i]][[1]]),names(vote.evs)[i]] <- vote.evs[[i]][[3]]
	}

jpeg(file="fig1a-vote2002.jpg",width=6,height=5,units="in",res=350)
    par(mar=c(4,4,1,2.5))
    plot(c(min(pe$x),max(pe$x)),c(0,1),type="n",ylab="Probability of Voting for Lula",
                    xlab="Income Bracket",bty="n",
                    main="",xaxt="n",cex.lab=1.2)
    axis(side=1,at=ats[c(1,4,6,7,8)],
    		labels=gsub(" ","",rownames(pe)[c(1,4,6,7,8)]),cex.axis=0.85)
    cols<-1
    for(j in grep("2002",names(pe))){
    	lines(na.omit(cbind(pe$x,pe[,j])),col=1,lwd=2)
    	polygon(na.omit(
    		rbind(cbind(ub$x,ub[,j]),
    		cbind(lb$x[nrow(lb):1],lb[nrow(lb):1,j]))),
    		density=50,angle=45,col=gray(cols*0.15),lty = 3,border=NULL)
    	cols<- cols+1
    }       
    lbl <- pe[nrow(pe),grep("2002",names(pe))]
    text(rep(max(pe$x),length(lbl)),
    		as.numeric(lbl)+c(-.016,.011,0,0),
    		gsub("-?\\d","",names(lbl)),pos=4,xpd=NA,cex=0.9)
  	dev.off()
   
 	jpeg(file="fig1b-vote2006.jpg",width=6,height=5,units="in",res=350)
    par(mar=c(4,4,1,2.5))
    plot(c(min(pe$x),max(pe$x)),c(0,1),type="n",ylab="Probability of Voting for Lula",
                    xlab="Income Bracket",bty="n",
                    main="",xaxt="n",cex.lab=1.2)
    axis(side=1,at=ats[c(1,4,6,7,8)],
    	labels=gsub(" ","",rownames(pe)[c(1,4,6,7,8)]),cex.axis=0.85)
    cols<-1
    for(j in grep("2006",names(pe))){
    	lines(na.omit(cbind(pe$x,pe[,j])),col=1,lwd=2)
    	polygon(na.omit(
    		rbind(cbind(ub$x,ub[,j]),
    		cbind(lb$x[nrow(lb):1],lb[nrow(lb):1,j]))),
    		density=50,angle=45,col=gray(cols*0.15),lty = 3,border=NULL)
    	cols<- cols+1
    }       
    lbl <- pe[nrow(pe),grep("2006",names(pe))]
    text(rep(max(pe$x),length(lbl)),
    	as.numeric(lbl)+c(.012,0,-0.012,0),
    	gsub("-?\\d","",names(lbl)),pos=4,xpd=NA,cex=0.9)
  	dev.off()


#### Generate Figure 2
#### Comparing hdi-m X vote for lula in four surveys for which we have hdi-m
the.data[[6]] <- NULL;the.data[[3]] <- NULL #exclude surveys without municipality data
number.surveys  <- length(the.data)
hdi.effects<- hdi.effects.int <- all.prop <- average <-  hdi.predict.int<- list()

### Compute relationship between HDI-M and vote for lula
for(i in number.surveys:1){
	local.data <- the.data[[i]]  
	survey.name <- names(the.data)[i] 
    mhdi <- median(local.data$hdi.2000,na.rm=T)
    cat("HDI-M median respondent:",median(local.data$hdi.2000,na.rm=T),"\n")
    cat("HDI-M median municipality:",median(unique(local.data$hdi.2000,na.rm=T)),"\n")
    local.data$highhdi <- local.data$hdi.2000>mhdi
    local.data$highinc <- is.element(local.data$income,c("05-10 sm","10+ sm","02-03 sm","02-05 sm","03-05 sm"))
    print(prop.table(table(local.data$highinc)))
    all.prop[[survey.name]] <- c(prop.table(table(local.data$highinc,local.data$highhdi)))
    average[[survey.name]]<-mean(local.data$vote=="lula",na.rm=T)
    tmp<-summary(lm(I(vote=="lula")~highhdi,data=local.data))
    hdi.effects[[survey.name]] <- tmp$coef
    tmp.2<-summary(lm(I(vote=="lula")~highhdi*highinc,data=local.data))
    tmp.2 <- lm(I(vote=="lula")~highhdi*highinc,data=local.data)
    new.data <- data.frame(highhdi=c(F,T,F,T),highinc=c(F,F,T,T))
    hdi.effects.int[[survey.name]] <- tmp.2$coef 
    hdi.predict.int[[survey.name]] <- predict(tmp.2,new.data,se.fit=T,interval="confidence")[[1]]
    }  
hdi <-  list(hdi.effects,hdi.effects.int, hdi.predict.int)
   
       
### THe Basic Figure in the Paper
my.points<- function(x,xc,yr.lty,surv){
    pe <- x[,1]
    ub <- x[,"upr"] 
    lb <- x[,"lwr"]
    tmp <-rbind(cbind(rep(xc[1],2),c(ub[1],lb[1])),
                c(NA,NA), 
                cbind(rep(xc[2],2),c(ub[2],lb[2])),
                c(NA,NA), 
                cbind(rep(xc[3],2),c(ub[3],lb[3])),
                c(NA,NA), 
                cbind(rep(xc[4],2),c(ub[4],lb[4])))  
    lines(tmp,lwd=2,col=1,lty=yr.lty)
    points(xc,pe,pch=surv,bg=gray(1),cex=1.2)
}

xc <- 1:4
off <- c(-.3,-.1,.1,.3)
all.xc <- list(xc+off[1],xc+off[2],xc+off[3],xc+off[4])
xlims <- c(0.5,4.5)
ylims <- c(0,1)

jpeg(file="fig2-voteHDI-M.jpg",width=6,height=5,units="in",res=350)
par(mar=c(5,3.5,2,1))
plot(xlims,y=ylims,type="n",ylab="",xlab="",xaxt="n",bty="n")
polygon(x=c(xlims[1],rep(xlims[2],2),xlims[1]),y= c(rep(ylims[1],2),rep(ylims[2],2)),col=gray(0.75),border=NA,cex.lab=1.2)
abline(v=seq(1.5,xlims[2],1),lty=1,col=gray(1),lwd=4)
abline(v=seq(1,xlims[2],1),lty=2,col=gray(1),lwd=1)
axis(side=1,line=-0.3,at=1:4,labels=rep("",4),tick=T,cex.axis=1.2)
axis(side=1,line=.9,at=1:4,labels=c("Poorest\nLow HDI-M","Poorest\nHigh HDI-M",
    "Non-Poor\nLow HDI-M","Non-Poor\nHigh HDI-M"),tick=F,cex.axis=1.1)
mtext("Vote for Lula",side=2,line=2.57,cex=1.3)

for(i in 1:4){my.points(hdi[[3]][[i]],xc=all.xc[[i]],yr.lty=c(1,1,2,2)[i],surv=(21:24)[i])}
axis(side=1,line=2,at=1:4,label=paste("(",round(apply(do.call(rbind,all.prop),2,mean)*100)[c(1,3,2,4)],"%)",sep="")
    ,tick=F,cex=1.1)
abline(h=mean(average[[1]],average[[2]]),col=1,lty=3)
abline(h=mean(average[[3]],average[[4]]),col=1,lty=3)
legend(x="top",legend=gsub("(.*\\D).*$","\\1",names(hdi[[3]])),
       bty="n",merge=T,lty=c(1,1,2,2), pch=21:24,pt.bg=0,bg="white",
       ,pt.cex=1.3,horiz=T,xpd=T,inset=c(0.01,-0.06),cex=1.1)
dev.off()
  
  
##### Tommorgraphy plots for the change, in the webappendix
library(MCMCpack)
tmp <-sapply(hdi[[3]],function(x){x[,1]})
tmp2 <- round(data.frame(m2002=apply(tmp[,c(1,2)],1,mean),m2006=apply(tmp[,c(3,4)],1,mean))*100) #mean of surveys for same year, by each categoy

if(.Platform$OS.type=="unix"){quartz(title="Tomography 1",width=10,height=10)
						}else{ windows(10,10)}
par(mar=c(4,4,1,1))
tomogplot(100-tmp2[,1], tmp2[,1], 100-tmp2[,2], tmp2[,2], 
            xlab="Share of 2002 Non-Supporters that Continued Non-Supporters in 2006",
          ylab="Share of Supporters 2002 that Become Non-Supporters in 2006", lwd=2,col=1:4)   
text(c(.3,.3,.8,.5),c(.3,.8,.4,.8),labels=c("Poorest\nLow HDI-M","",
    "","Non-Poor\nHigh HDI-M"),pos=c(2,2,4,4))

if(.Platform$OS.type=="unix"){quartz(title="Tomography 2",width=10,height=10)
						}else{ windows(10,10)}
tomogplot(tmp2[,1], 100-tmp2[,1], tmp2[,2], 100-tmp2[,2], 
            xlab="Share of 2002 Supporters that Remained  Supporters in 2006",
          ylab="Share of Non Supporters in 2002 that Became Supporters in 2006", lwd=2)   
text(c(.6,.3,.8,.35),c(.8,.8,.4,.4),labels=c("Poorest\nLow HDI-M","",
    "","Non-Poor\nHigh HDI-M"),pos=c(4,4,2,2))


